import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import statsmodels.api as sm
# load US wealth and income distribution data
perc_wealth = pd.read_csv('data//percent_of_total_wealth.csv', index_col = 0)
# top decile's share = top 1% share plus the 90th-99th percentile band
perc_wealth['Top 10%'] = perc_wealth['90th-99th'].add(perc_wealth['Top 1%'])
perc_wealth.tail()
gini_data = pd.read_csv('data//household_gini.csv')
perc_wealth.head()
gini_data.head()

# interactive line chart of each group's share of total net worth over time
share_cols = ['Top 10%', '50th - 90th', 'Bottom 50%']
fig = px.line(
    perc_wealth,
    x=perc_wealth.index,
    y=share_cols,
    title='Shares of Total Net Worth Over Time',
    labels={'DATE': 'Date',
            'value': 'Percent of Total Net Worth',
            'variable': 'Wealth Percentile'},
)
fig.update_layout(hovermode="x")  # show all three series in one hover label
fig.show()
# stacked bar chart form of above: one bar per quarter, split by wealth group
fig2 = go.Figure(data=[
    go.Bar(name='Top 10%', x=perc_wealth.index, y=perc_wealth['Top 10%']),
    go.Bar(name='50th - 90th Percentile', x=perc_wealth.index, y=perc_wealth['50th - 90th']),
    go.Bar(name='Bottom 50%', x=perc_wealth.index, y=perc_wealth['Bottom 50%'])
])
fig2.update_layout(yaxis=dict(
    title_text="% of Total Net Worth",
    ticktext=["0%", "20%", "40%", "60%","80%","100%"],
    tickvals=[0, 20, 40, 60, 80, 100],
    tickmode="array",
    # `titlefont` is deprecated in plotly; `title_font` is the supported key
    title_font=dict(size=15),
    ),
    title = 'Shares of Total Net Worth Over Time',
    barmode='stack')  # stack the groups so each bar sums to ~100%
fig2.show()
The top 10% of wealth holders have consistently increased their share of total wealth, from 60.3% in Q3 1989 to 69.6% in Q3 2021.
Interestingly, while the middle 40% lost share of wealth over this period, the bottom 50% saw a steep increase in their share beginning in Q1 2021.
# zoom in on the bottom half of the wealth distribution
fig_bot = px.line(
    perc_wealth,
    x=perc_wealth.index,
    y='Bottom 50%',
    title='Share of Wealth held by Bottom 50%',
    labels={'DATE': 'Date', 'Bottom 50%': 'Percent of Total Net Worth'},
)
fig_bot.update_layout(hovermode="x")
fig_bot.show()

# household income Gini ratio over time
fig3 = px.line(
    gini_data,
    x='DATE',
    y='GINI',
    title='Household Income Gini Ratio',
    labels={'DATE': 'Date', 'GINI': 'Gini Ratio'},
)
fig3.update_layout(hovermode="x")
fig3.show()
Since 1968, income inequality has continued to rise.
# load OECD and UN economic, QOL data
productivity = pd.read_csv('data//GDP_per_labor_hr.csv')
oecd_gini = pd.read_csv('data//oecd_gini.csv')
oecd_tax_rev = pd.read_csv('data//oecd_tax_rev_perGDP.csv')
happiness = pd.read_csv('data//whr_data.csv')
hdi = pd.read_csv('data//hdi.csv', index_col =0)

# merge OECD data into one dataframe keyed on country code and year;
# pandas suffixes the clashing 'Value' columns as Value_x / Value_y
merge_keys = ['LOCATION', 'TIME']
oecd_combined_data = (
    oecd_gini
    .merge(oecd_tax_rev, how='left', on=merge_keys)
    .merge(productivity, how='left', on=merge_keys)
)
oecd_combined_data = oecd_combined_data[['LOCATION', 'TIME', 'Value_x', 'Value_y', 'Value']]
oecd_combined_data = oecd_combined_data.rename(columns={
    'Value_x': 'Gini',
    'Value_y': 'Tax Revenue (% of GDP)',
    'Value': 'Productivity (GDP/Labor Hr)',
})
oecd_combined_data.head()
# prepare World Happiness Report data: keep only the 'Life Ladder' score,
# restricted to 2018
happiness = happiness[['Country name','year','Life Ladder']]
happiness = happiness.rename(columns={'Life Ladder': 'Happiness Score'})
happiness = happiness[happiness['year'] == 2018]
happiness.head()
hdi.head()

# combine HDI and WHR data from the UN, matching on country name; inner join
# keeps only countries present in both tables
un_combined_data = happiness.merge(
    hdi, how='inner', left_on='Country name', right_on='Country'
)
un_combined_data = un_combined_data.drop(columns=['Country'])  # redundant after join
un_combined_data.head()
# select only OECD rows from 2018 since hdi data is only complete for 2018
oecd_combined_data = oecd_combined_data[oecd_combined_data['TIME']==2018]

# convert OECD ISO-3 abbreviations into UN-format country names so the two
# datasets can be joined on country name
# (fixed: 'SVN':'Slovenia' was listed twice in this dict)
country_abbr = {'AUS':'Australia','AUT':'Austria','BEL':'Belgium','CAN':'Canada','CZE':'Czech Republic',
'DNK':'Denmark','FRA':'France','DEU':'Germany','GRC':'Greece','HUN':'Hungary','LUX':'Luxembourg',
'IRL':'Ireland','ITA':'Italy','NLD':'Netherlands','NOR':'Norway','PRT':'Portugal','SVK':'Slovakia','ESP':'Spain',
'SWE':'Sweden','GBR':'United Kingdom','USA':'United States','EST':'Estonia','SVN':'Slovenia',
'JPN':'Japan','KOR':'South Korea','MEX':'Mexico','LVA':'Latvia','LTU':'Lithuania','CRI':'Costa Rica',
'CHE':'Switzerland','TUR':'Turkey','ISR':'Israel','POL':'Poland','FIN':'Finland'}
oecd_combined_data = oecd_combined_data.replace(country_abbr)
oecd_combined_data

# combine oecd and un dataframes into one; dropna discards countries missing
# from either source
final_dataset = oecd_combined_data.merge(un_combined_data, how = 'left',
                                         left_on = 'LOCATION', right_on = 'Country name')
final_dataset = final_dataset.drop(columns = ['Country name','year'])
final_dataset = final_dataset.dropna()
final_dataset
# scatter plot matrix to examine pairwise associations between predictors
plot_dims = ['Gini', 'Tax Revenue (% of GDP)',
             'Productivity (GDP/Labor Hr)', 'Happiness Score', 'HDI (2018)']
fig4 = px.scatter_matrix(final_dataset, dimensions=plot_dims,
                         height=1500, width=1500)
# hide the diagonal panels (each variable against itself is uninformative)
fig4.update_traces(diagonal_visible=False)
fig4.show()
Gini appears to have the strongest linear associations with tax revenue and HDI.
# plot Gini against Tax Revenue and HDI specifically, side by side
# (removed redundant re-imports: make_subplots and go are already imported
# at the top of the file)
fig5 = make_subplots(rows=1, cols=2)
fig5.add_trace(
    go.Scatter(y=final_dataset['Gini'], x=final_dataset['Tax Revenue (% of GDP)'],
               text=final_dataset['LOCATION'],  # hover text identifies the country
               mode="markers"),
    row=1, col=1
)
fig5.add_trace(
    go.Scatter(y=final_dataset['Gini'], x=final_dataset['HDI (2018)'],
               text=final_dataset['LOCATION'], mode="markers"),
    row=1, col=2
)
fig5.update_xaxes(title_text="Tax Revenue (% of GDP)", row=1, col=1)
fig5.update_xaxes(title_text="HDI (2018)", row=1, col=2)
fig5.update_yaxes(title_text="Gini Ratio", row=1, col=1)
fig5.update_yaxes(title_text="Gini Ratio", row=1, col=2)
fig5.update_layout(height=600, width=1000, showlegend=False)
fig5.show()
We look for a model in which every predictor has a p-value below the threshold $\alpha = 0.05$.
# backward elimination: fit the model containing all predictors, then repeatedly
# drop the predictor with the highest p-value above alpha = .05
y = final_dataset['Gini']

def _design(cols):
    """Build a design matrix from the chosen predictor columns, with intercept."""
    return sm.add_constant(final_dataset[cols])

# model 0: all four predictors
X1 = _design(['Tax Revenue (% of GDP)', 'Productivity (GDP/Labor Hr)',
              'Happiness Score', 'HDI (2018)'])
full_model = sm.OLS(y, X1)
results = full_model.fit()
print(results.summary())

# model 1: 'Happiness Score' removed
X2 = _design(['Tax Revenue (% of GDP)', 'Productivity (GDP/Labor Hr)', 'HDI (2018)'])
model1 = sm.OLS(y, X2)
results1 = model1.fit()
print(results1.summary())

# model 2: 'Productivity (GDP/Labor Hr)' removed as well
X3 = _design(['Tax Revenue (% of GDP)', 'HDI (2018)'])
model2 = sm.OLS(y, X3)
results2 = model2.fit()
print(results2.summary())
for the model: $$\widehat{\text{Gini}} = 0.7763 - 0.0041 \cdot \text{Tax Revenue (\% of GDP)} - 0.03541 \cdot \text{HDI}$$
the coefficients have p-values below the threshold $\alpha = 0.05$, so we reject the null hypothesis that the coefficients are equal to zero. In addition, the F-test statistic is in the rejection region, so there is evidence that at least one of the predictors affects $\text{Gini}$.
We also have $R_\text{adj}^2 = 0.522$, so approximately 52% of the variation in $\text{Gini}$ can be accounted for by the explanatory variables.
# residual diagnostics for the final model (results2):
# partial-regression / residual plots for each retained predictor
fig6 = plt.figure(figsize=(9,6))
fig6 = sm.graphics.plot_regress_exog(results2, 'Tax Revenue (% of GDP)', fig=fig6)
fig7 = plt.figure(figsize=(9,6))
fig7 = sm.graphics.plot_regress_exog(results2, 'HDI (2018)', fig=fig7)

# fixed: the histogram figure was created but its return value discarded, so it
# never rendered outside a notebook; capture it and call .show()
fig_resid = px.histogram(x=results2.resid, labels = {'x':'Residuals'},
                         title = 'Histogram of Model Residuals')
fig_resid.show()

# normal probability (Q-Q) plot of the residuals
npp = sm.ProbPlot(results2.resid)
fig8 = npp.qqplot(line='s')
plt.title('Normal Probability Plot')
plt.show()

# Shapiro-Wilk normality test; print so the statistic and p-value are visible
# when run as a script (bare expression only displays inside a notebook)
from scipy import stats
print(stats.shapiro(results2.resid))
$p = 0.44 > 0.05$, so we fail to reject the null hypothesis: there is not enough evidence to suggest that the residuals are not normally distributed.
Neither of the residual plots shows any clear pattern or clear sign of heteroscedasticity. Since the residuals are also normally distributed, the linear model is appropriate.